Input data

# Load the daily bike-sharing dataset; readr prints the column spec below.
bike_share <- read_csv("bike_sharing_dataset.csv")
## Rows: 2922 Columns: 29── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (28): temp_avg, temp_min, temp_max, temp_observ, precip, wind, wt_fog, ...
## date  (1): date
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(space_missions)
#View(bike_share)

EDA

# Keep a working copy of the response series, then inspect the table structure.
total_cust = bike_share$total_cust
str(bike_share)
## spec_tbl_df [2,922 × 29] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date             : Date[1:2922], format: "2011-01-01" "2011-01-02" ...
##  $ temp_avg         : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
##  $ temp_min         : num [1:2922] -1.57 0.88 -3.44 -5.96 -4.29 ...
##  $ temp_max         : num [1:2922] 11.97 13.81 7.46 4.64 6.11 ...
##  $ temp_observ      : num [1:2922] 2.77 7.33 -3.06 -3.1 -1.77 ...
##  $ precip           : num [1:2922] 0.0693 1.0373 1.8788 0 0 ...
##  $ wind             : num [1:2922] 2.58 3.92 3.62 1.8 2.95 ...
##  $ wt_fog           : num [1:2922] 1 1 NA NA NA NA 1 1 NA NA ...
##  $ wt_heavy_fog     : num [1:2922] NA 1 NA NA NA NA NA 1 NA NA ...
##  $ wt_thunder       : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
##  $ wt_sleet         : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
##  $ wt_hail          : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
##  $ wt_glaze         : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
##  $ wt_haze          : num [1:2922] 1 NA NA NA NA NA 1 1 NA NA ...
##  $ wt_drift_snow    : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
##  $ wt_high_wind     : num [1:2922] NA NA NA NA NA NA NA NA 1 1 ...
##  $ wt_mist          : num [1:2922] 1 1 NA NA NA NA 1 1 NA NA ...
##  $ wt_drizzle       : num [1:2922] NA 1 NA NA NA NA NA NA NA NA ...
##  $ wt_rain          : num [1:2922] 1 1 NA NA NA NA NA NA NA NA ...
##  $ wt_freeze_rain   : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
##  $ wt_snow          : num [1:2922] NA NA NA NA NA 1 1 1 NA NA ...
##  $ wt_ground_fog    : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
##  $ wt_ice_fog       : num [1:2922] NA NA NA NA NA NA NA 1 NA NA ...
##  $ wt_freeze_drizzle: num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
##  $ wt_unknown       : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
##  $ casual           : num [1:2922] 330 130 120 107 82 88 148 68 54 41 ...
##  $ registered       : num [1:2922] 629 651 1181 1429 1489 ...
##  $ total_cust       : num [1:2922] 959 781 1301 1536 1571 ...
##  $ holiday          : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   date = col_date(format = ""),
##   ..   temp_avg = col_double(),
##   ..   temp_min = col_double(),
##   ..   temp_max = col_double(),
##   ..   temp_observ = col_double(),
##   ..   precip = col_double(),
##   ..   wind = col_double(),
##   ..   wt_fog = col_double(),
##   ..   wt_heavy_fog = col_double(),
##   ..   wt_thunder = col_double(),
##   ..   wt_sleet = col_double(),
##   ..   wt_hail = col_double(),
##   ..   wt_glaze = col_double(),
##   ..   wt_haze = col_double(),
##   ..   wt_drift_snow = col_double(),
##   ..   wt_high_wind = col_double(),
##   ..   wt_mist = col_double(),
##   ..   wt_drizzle = col_double(),
##   ..   wt_rain = col_double(),
##   ..   wt_freeze_rain = col_double(),
##   ..   wt_snow = col_double(),
##   ..   wt_ground_fog = col_double(),
##   ..   wt_ice_fog = col_double(),
##   ..   wt_freeze_drizzle = col_double(),
##   ..   wt_unknown = col_double(),
##   ..   casual = col_double(),
##   ..   registered = col_double(),
##   ..   total_cust = col_double(),
##   ..   holiday = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Column summaries; note the NA counts (e.g. 4 NAs in casual/registered/total_cust).
summary(bike_share)
##       date               temp_avg          temp_min           temp_max    
##  Min.   :2011-01-01   Min.   :-12.100   Min.   :-16.9937   Min.   :-7.98  
##  1st Qu.:2012-12-31   1st Qu.:  6.567   1st Qu.:  0.5165   1st Qu.:11.08  
##  Median :2014-12-31   Median : 15.433   Median :  8.5049   Median :19.99  
##  Mean   :2014-12-31   Mean   : 14.419   Mean   :  8.5065   Mean   :19.02  
##  3rd Qu.:2016-12-30   3rd Qu.: 23.067   3rd Qu.: 17.3384   3rd Qu.:27.87  
##  Max.   :2018-12-31   Max.   : 31.733   Max.   : 26.2063   Max.   :37.85  
##                       NA's   :821                                         
##   temp_observ          precip               wind            wt_fog    
##  Min.   :-15.658   Min.   :  0.00000   Min.   : 0.375   Min.   :1     
##  1st Qu.:  3.013   1st Qu.:  0.00551   1st Qu.: 2.200   1st Qu.:1     
##  Median : 11.619   Median :  0.27150   Median : 2.900   Median :1     
##  Mean   : 11.069   Mean   :  3.43573   Mean   : 3.163   Mean   :1     
##  3rd Qu.: 19.767   3rd Qu.:  2.88538   3rd Qu.: 3.875   3rd Qu.:1     
##  Max.   : 28.667   Max.   :118.78980   Max.   :12.750   Max.   :1     
##                                                         NA's   :1419  
##   wt_heavy_fog    wt_thunder      wt_sleet       wt_hail        wt_glaze   
##  Min.   :1      Min.   :1      Min.   :1      Min.   :1      Min.   :1     
##  1st Qu.:1      1st Qu.:1      1st Qu.:1      1st Qu.:1      1st Qu.:1     
##  Median :1      Median :1      Median :1      Median :1      Median :1     
##  Mean   :1      Mean   :1      Mean   :1      Mean   :1      Mean   :1     
##  3rd Qu.:1      3rd Qu.:1      3rd Qu.:1      3rd Qu.:1      3rd Qu.:1     
##  Max.   :1      Max.   :1      Max.   :1      Max.   :1      Max.   :1     
##  NA's   :2714   NA's   :2228   NA's   :2793   NA's   :2872   NA's   :2769  
##     wt_haze     wt_drift_snow   wt_high_wind     wt_mist       wt_drizzle  
##  Min.   :1      Min.   :1      Min.   :1      Min.   :1      Min.   :1     
##  1st Qu.:1      1st Qu.:1      1st Qu.:1      1st Qu.:1      1st Qu.:1     
##  Median :1      Median :1      Median :1      Median :1      Median :1     
##  Mean   :1      Mean   :1      Mean   :1      Mean   :1      Mean   :1     
##  3rd Qu.:1      3rd Qu.:1      3rd Qu.:1      3rd Qu.:1      3rd Qu.:1     
##  Max.   :1      Max.   :1      Max.   :1      Max.   :1      Max.   :1     
##  NA's   :2217   NA's   :2915   NA's   :2664   NA's   :2551   NA's   :2794  
##     wt_rain     wt_freeze_rain    wt_snow     wt_ground_fog    wt_ice_fog  
##  Min.   :1      Min.   :1      Min.   :1      Min.   :1      Min.   :1     
##  1st Qu.:1      1st Qu.:1      1st Qu.:1      1st Qu.:1      1st Qu.:1     
##  Median :1      Median :1      Median :1      Median :1      Median :1     
##  Mean   :1      Mean   :1      Mean   :1      Mean   :1      Mean   :1     
##  3rd Qu.:1      3rd Qu.:1      3rd Qu.:1      3rd Qu.:1      3rd Qu.:1     
##  Max.   :1      Max.   :1      Max.   :1      Max.   :1      Max.   :1     
##  NA's   :2516   NA's   :2917   NA's   :2838   NA's   :2886   NA's   :2912  
##  wt_freeze_drizzle   wt_unknown       casual          registered   
##  Min.   :1         Min.   :1      Min.   :    2.0   Min.   :   19  
##  1st Qu.:1         1st Qu.:1      1st Qu.:  512.2   1st Qu.: 3839  
##  Median :1         Median :1      Median : 1220.5   Median : 5964  
##  Mean   :1         Mean   :1      Mean   : 1679.8   Mean   : 6046  
##  3rd Qu.:1         3rd Qu.:1      3rd Qu.: 2357.2   3rd Qu.: 8188  
##  Max.   :1         Max.   :1      Max.   :10173.0   Max.   :15419  
##  NA's   :2918      NA's   :2921   NA's   :4         NA's   :4      
##    total_cust       holiday    
##  Min.   :   21   Min.   :1     
##  1st Qu.: 4628   1st Qu.:1     
##  Median : 7442   Median :1     
##  Mean   : 7726   Mean   :1     
##  3rd Qu.:10850   3rd Qu.:1     
##  Max.   :19113   Max.   :1     
##  NA's   :4       NA's   :2833
# Locate the missing observations in the response series.
na_index = list(which(is.na(bike_share$total_cust), arr.ind=TRUE))
na_index
## [[1]]
## [1] 1849 1850 1851 1852

There are 4 consecutive days with missing data in total_cust.

# Zoom in around the gap (indices 1849-1852) to see the neighbouring values.
tsplot(total_cust[1800:1900])

Thus, I will fill those values by linear interpolation between the observed values at indices 1848 and 1853.

# Linear interpolation across the 4-day gap: temp is the total drop from
# index 1848 to 1853, and each missing day steps down 1/5 of that drop,
# so index 1853 would be reached exactly at step 5/5.
temp = (total_cust[[1848]] - total_cust[[1853]] )
temp
## [1] 2457
bike_share$total_cust[1849] <- total_cust[[1848]] - (1/5)*temp
bike_share$total_cust[1850] <- total_cust[[1848]] - 2*(1/5)*temp
bike_share$total_cust[1851] <- total_cust[[1848]] - 3*(1/5)*temp
bike_share$total_cust[1852] <- total_cust[[1848]] - 4*(1/5)*temp
# Refresh the working copy and confirm the gap is filled.
total_cust <- bike_share$total_cust
tsplot(total_cust[1800:1900])

which(is.na(bike_share$total_cust), arr.ind=TRUE)
## integer(0)

Now there are no more NA values in total_cust.

# Full-series view of daily total customers (ggplot and astsa versions).
ggplot( data = bike_share, aes( date, total_cust )) + 
  geom_line() +
  ylab('Total Customers') +
  xlab('Time') +
  theme_minimal()

tsplot(bike_share$total_cust)

Use log version

# Log transform to stabilise the variance of the count series.
log_total_cust <- log(bike_share$total_cust)
tsplot(log_total_cust)

ggplot( data = bike_share, aes( date, log_total_cust )) + 
  geom_line() +
  ylab('Transformed Total Customers') +
  xlab('Time') +
  theme_minimal()

Non Parametric Trend

Smoothing with frequency = 7

# Weekly cycle (frequency = 7) with a normal-kernel smoother overlaid.
cust_2week = ts(log_total_cust, freq=7)
tsplot(cust_2week, col=8, ylab = 'Total Customers') # the time scale matters (not shown)
lines(ksmooth(time(cust_2week), cust_2week, "normal", bandwidth=12), lwd=2, col=4)

Smoothing with frequency = 14

# Same smoother at a biweekly cycle (frequency = 14).
cust_2week = ts(log_total_cust, freq=14)
tsplot(cust_2week, col=8, ylab = 'Total Customers') # the time scale matters (not shown)
lines(ksmooth(time(cust_2week), cust_2week, "normal", bandwidth=12), lwd=2, col=4)

Smoothing with frequency = 28

# Same smoother at a four-week cycle (frequency = 28).
cust_2week = ts(log_total_cust, freq=28)
tsplot(cust_2week, col=8, ylab = 'Total Customers') # the time scale matters (not shown)
lines(ksmooth(time(cust_2week), cust_2week, "normal", bandwidth=12), lwd=2, col=4)

Decomposition

# Classical decomposition at annual frequency.
# NOTE(review): end=c(2018,365) keeps 8*365 = 2920 of the 2922 points -- confirm
# the two-observation truncation (leap days ignored) is intended.
sub1_ts <- ts(log_total_cust, start=c(2011,1), end=c(2018,365), frequency=365)
cust.dec = decompose(sub1_ts)
plot(cust.dec)

# NOTE(review): cust_2week here is still the freq-28 ts from the previous
# chunk; re-wrapping it at frequency=365 looks like a leftover -- verify.
sub1_ts <- ts(cust_2week, start=c(2011,1), end=c(2018,365), frequency=365)
cust.dec = decompose(sub1_ts)
plot(cust.dec)

Fitting SARIMA Model

Now we check the detrended and differenced time series to see which one is stationary

# Compare the log-transformed series against the first-differenced series.
par(mfrow=2:1) # plot transformed data
tsplot(log(total_cust), main="log" )
tsplot(diff(total_cust), main="differenced" )

Differenced time series looks stationary.

# First difference of the log series, then the DF-GLS (ERS) unit-root test.
tsplot(diff(log_total_cust))

summary(ur.ers(diff(log_total_cust)))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3195 -0.0392  0.1196  0.2621  3.3528 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.50388    0.03496  -14.41   <2e-16 ***
## yd.diff.lag1 -0.63055    0.03396  -18.57   <2e-16 ***
## yd.diff.lag2 -0.57342    0.03166  -18.11   <2e-16 ***
## yd.diff.lag3 -0.41255    0.02679  -15.40   <2e-16 ***
## yd.diff.lag4 -0.16818    0.01830   -9.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4282 on 2911 degrees of freedom
## Multiple R-squared:  0.5772, Adjusted R-squared:  0.5764 
## F-statistic: 794.7 on 5 and 2911 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -14.4112 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
# ADF test on the differenced series: rejects a unit root (p < 0.01 below).
adf.test(diff(log_total_cust))
## Warning in adf.test(diff(log_total_cust)): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(log_total_cust)
## Dickey-Fuller = -22.058, Lag order = 14, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron test: also rejects a unit root.
pp.test(diff(log_total_cust))
## Warning in pp.test(diff(log_total_cust)): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff(log_total_cust)
## Dickey-Fuller Z(alpha) = -2851.4, Truncation lag parameter = 9, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS (null hypothesis = stationarity): fails to reject (p > 0.1 below).
kpss.test(diff(log_total_cust))
## Warning in kpss.test(diff(log_total_cust)): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff(log_total_cust)
## KPSS Level = 0.028709, Truncation lag parameter = 9, p-value = 0.1

After 4 tests, I am confident that the differenced time series is stationary.

# Baseline random-walk model: ARIMA(0,1,0) on the log series.
mod1 <- Arima(log_total_cust, order = c(0,1,0))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

mod1
## Series: log_total_cust 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.1654:  log likelihood = -1516.44
## AIC=3034.88   AICc=3034.88   BIC=3040.86
# Add an AR(1) term: ARIMA(1,1,0).
mod1 <- Arima(log_total_cust, order = c(1,1,0))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

mod1
## Series: log_total_cust 
## ARIMA(1,1,0) 
## 
## Coefficients:
##           ar1
##       -0.3081
## s.e.   0.0176
## 
## sigma^2 = 0.1497:  log likelihood = -1370.88
## AIC=2745.76   AICc=2745.76   BIC=2757.71
# Add an MA(1) term as well: ARIMA(1,1,1).
mod1 <- Arima(log_total_cust, order = c(1,1,1))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

mod1
## Series: log_total_cust 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.3602  -0.9054
## s.e.  0.0208   0.0090
## 
## sigma^2 = 0.1234:  log likelihood = -1087.92
## AIC=2181.85   AICc=2181.85   BIC=2199.78
# Automatic order search; it selects ARIMA(3,1,2) (output below).
auto.arima(log_total_cust)
## Series: log_total_cust 
## ARIMA(3,1,2) 
## 
## Coefficients:
##          ar1     ar2      ar3      ma1      ma2
##       -0.511  0.3065  -0.0382  -0.0285  -0.7842
## s.e.   0.058  0.0323   0.0208   0.0552   0.0509
## 
## sigma^2 = 0.1231:  log likelihood = -1083.78
## AIC=2179.56   AICc=2179.59   BIC=2215.44
# Fit the auto-selected ARIMA(3,1,2) and check coefficient significance.
mod1 <- Arima(log_total_cust, order = c(3,1,2))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

# ar3 and ma1 are not significant at the 5% level (see z test below).
coeftest(mod1)
## 
## z test of coefficients:
## 
##      Estimate Std. Error  z value Pr(>|z|)    
## ar1 -0.511038   0.058015  -8.8087  < 2e-16 ***
## ar2  0.306507   0.032295   9.4908  < 2e-16 ***
## ar3 -0.038163   0.020764  -1.8379  0.06608 .  
## ma1 -0.028493   0.055182  -0.5163  0.60561    
## ma2 -0.784198   0.050940 -15.3946  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Print the fit (AIC/BIC shown below).
mod1
## Series: log_total_cust 
## ARIMA(3,1,2) 
## 
## Coefficients:
##          ar1     ar2      ar3      ma1      ma2
##       -0.511  0.3065  -0.0382  -0.0285  -0.7842
## s.e.   0.058  0.0323   0.0208   0.0552   0.0509
## 
## sigma^2 = 0.1231:  log likelihood = -1083.78
## AIC=2179.56   AICc=2179.59   BIC=2215.44
# Refit the simpler ARIMA(1,1,1); both coefficients are highly significant.
mod1 <- Arima(log_total_cust, order = c(1,1,1))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

coeftest(mod1)
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ar1  0.3601614  0.0207536   17.354 < 2.2e-16 ***
## ma1 -0.9053887  0.0089533 -101.123 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Print the fit (AIC/BIC shown below).
mod1
## Series: log_total_cust 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.3602  -0.9054
## s.e.  0.0208   0.0090
## 
## sigma^2 = 0.1234:  log likelihood = -1087.92
## AIC=2181.85   AICc=2181.85   BIC=2199.78

Best ARIMA model:

# Chosen model: ARIMA(1,1,1) -- nearly the AIC of (3,1,2) with all terms
# significant.
mod1 <- Arima(log_total_cust, order = c(1,1,1))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

coeftest(mod1)
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ar1  0.3601614  0.0207536   17.354 < 2.2e-16 ***
## ma1 -0.9053887  0.0089533 -101.123 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Print the chosen fit.
mod1
## Series: log_total_cust 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.3602  -0.9054
## s.e.  0.0208   0.0090
## 
## sigma^2 = 0.1234:  log likelihood = -1087.92
## AIC=2181.85   AICc=2181.85   BIC=2199.78
#stationary tests for residuals
# ADF test on the model residuals: stationary (p < 0.01 below).
adf.test(mod1$residuals)
## Warning in adf.test(mod1$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod1$residuals
## Dickey-Fuller = -14.095, Lag order = 14, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron test on the residuals: also stationary.
pp.test(mod1$residuals)
## Warning in pp.test(mod1$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod1$residuals
## Dickey-Fuller Z(alpha) = -2898.9, Truncation lag parameter = 9, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS on the residuals: fails to reject level stationarity (p > 0.1).
kpss.test(mod1$residuals)
## Warning in kpss.test(mod1$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod1$residuals
## KPSS Level = 0.19141, Truncation lag parameter = 9, p-value = 0.1
# DF-GLS on the residuals: strongly rejects a unit root (statistic below).
summary(ur.ers(mod1$residuals))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6054 -0.0820  0.0471  0.1573  1.2879 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.03487    0.04154 -24.912   <2e-16 ***
## yd.diff.lag1  0.03902    0.03720   1.049   0.2943    
## yd.diff.lag2  0.03323    0.03205   1.037   0.2999    
## yd.diff.lag3  0.01562    0.02616   0.597   0.5505    
## yd.diff.lag4  0.03633    0.01855   1.958   0.0503 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3511 on 2912 degrees of freedom
## Multiple R-squared:  0.4992, Adjusted R-squared:  0.4984 
## F-statistic: 580.7 on 5 and 2912 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -24.9122 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62

Model utility tests:

# Ljung-Box on the residuals: p = 0.053 (below), borderline but acceptable.
checkresiduals(mod1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)
## Q* = 15.324, df = 8, p-value = 0.05314
## 
## Model df: 2.   Total lags used: 10
tsdiag(mod1) # looks good

#mod1 = sarima(total_cust,p=1,d=1,q=1,P=3,D=0,Q=0,S=7)

densityplot(as.numeric(mod1$residuals)) # residual density -- NOTE(review): well-behaved residuals should look roughly normal (bell-shaped), not uniform

# Same ARIMA(1,1,1) via astsa::sarima; it adds a constant term, which the
# ttable below shows is not significant (p = 0.66).
sarima(log_total_cust, 1,1,1)
## initial  value -0.899661 
## iter   2 value -0.953066
## iter   3 value -0.971487
## iter   4 value -0.981832
## iter   5 value -1.004571
## iter   6 value -1.020625
## iter   7 value -1.033720
## iter   8 value -1.037279
## iter   9 value -1.041903
## iter  10 value -1.045136
## iter  11 value -1.045558
## iter  12 value -1.045578
## iter  13 value -1.045589
## iter  14 value -1.045592
## iter  15 value -1.045592
## iter  16 value -1.045592
## iter  16 value -1.045592
## iter  16 value -1.045592
## final  value -1.045592 
## converged
## initial  value -1.046514 
## iter   2 value -1.046520
## iter   3 value -1.046521
## iter   4 value -1.046522
## iter   5 value -1.046522
## iter   5 value -1.046522
## iter   5 value -1.046522
## final  value -1.046522 
## converged

## $fit
## 
## Call:
## arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S), 
##     xreg = constant, transform.pars = trans, fixed = fixed, optim.control = list(trace = trc, 
##         REPORT = 1, reltol = tol))
## 
## Coefficients:
##          ar1      ma1  constant
##       0.3603  -0.9055     4e-04
## s.e.  0.0208   0.0090     1e-03
## 
## sigma^2 estimated as 0.1233:  log likelihood = -1087.83,  aic = 2183.66
## 
## $degrees_of_freedom
## [1] 2918
## 
## $ttable
##          Estimate     SE   t.value p.value
## ar1        0.3603 0.0208   17.3607  0.0000
## ma1       -0.9055 0.0090 -101.1686  0.0000
## constant   0.0004 0.0010    0.4335  0.6646
## 
## $AIC
## [1] 0.7475719
## 
## $AICc
## [1] 0.7475747
## 
## $BIC
## [1] 0.7557605
# 60-step-ahead forecasts on the log scale; the point forecasts quickly
# revert to a constant level, as expected for a non-seasonal ARIMA(1,1,1).
predictions_sarima = predict(mod1,n.ahead=60)
predictions_sarima 
## $pred
## Time Series:
## Start = 2923 
## End = 2982 
## Frequency = 1 
##  [1] 8.100830 8.215069 8.256213 8.271032 8.276369 8.278291 8.278984 8.279233
##  [9] 8.279323 8.279355 8.279367 8.279371 8.279373 8.279373 8.279373 8.279373
## [17] 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373
## [25] 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373
## [33] 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373
## [41] 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373
## [49] 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373
## [57] 8.279373 8.279373 8.279373 8.279373
## 
## $se
## Time Series:
## Start = 2923 
## End = 2982 
## Frequency = 1 
##  [1] 0.3512246 0.3858388 0.3963692 0.4018129 0.4058315 0.4093753 0.4127392
##  [8] 0.4160232 0.4192628 0.4224710 0.4256525 0.4288096 0.4319433 0.4350544
## [15] 0.4381433 0.4412106 0.4442567 0.4472820 0.4502870 0.4532721 0.4562377
## [22] 0.4591841 0.4621117 0.4650209 0.4679120 0.4707854 0.4736413 0.4764801
## [29] 0.4793021 0.4821076 0.4848968 0.4876701 0.4904277 0.4931699 0.4958969
## [36] 0.4986091 0.5013065 0.5039895 0.5066583 0.5093131 0.5119541 0.5145816
## [43] 0.5171958 0.5197968 0.5223848 0.5249601 0.5275228 0.5300731 0.5326112
## [50] 0.5351373 0.5376515 0.5401540 0.5426450 0.5451246 0.5475929 0.5500502
## [57] 0.5524965 0.5549321 0.5573570 0.5597714

Forecasting:

# Forecast from the fitted ARIMA(1,1,1) itself, then back-transform from the
# log scale. The previous call, forecast(exp(mod1$fitted), h = 365), ignored
# mod1 entirely: passing a bare numeric vector makes forecast() fit a new
# automatic model to the fitted values, which is not the model in the title.
forecastArea <- forecast(mod1, h = 365)
# Exponentiate the history, point forecasts and intervals back to counts.
forecastArea$x <- exp(forecastArea$x)
forecastArea$mean <- exp(forecastArea$mean)
forecastArea$lower <- exp(forecastArea$lower)
forecastArea$upper <- exp(forecastArea$upper)
plot(forecastArea,lwd=2,col="purple", main="Forecasts from SARIMA(1,1,1)", xlab="Time", ylab="Number of customers") 
legend("topleft", legend=c("Past", "Future"), col=c("Purple", "Blue"), lty=1:2, cex=0.8) 

Xreg with temp_max, precipitation, and wind

Check NA values in columns temp_max

# temp_max has no missing values (integer(0) below).
na_index = list(which(is.na(bike_share$temp_max), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)

precipitation:

# precip has no missing values.
na_index = list(which(is.na(bike_share$precip), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)

wind:

# wind has no missing values.
na_index = list(which(is.na(bike_share$wind), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)

We convert the temperature from Celsius to Fahrenheit so all values are positive, avoiding missing values when applying the log transformation to negative temperatures.

# Convert Celsius to Fahrenheit so every value is positive before logging,
# then confirm log() produces no missing values.
temp_max = bike_share$temp_max * 1.8 + 32
na_index = list(which(is.na(log(temp_max)), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)

We shift the precipitation up by 1 so we can apply the log transformation (precipitation can be zero).

# Check the shifted precipitation (precip + 1) for NAs under the log
# transform. The previous code re-checked log(temp_max) here -- a copy-paste
# slip that never validated precip at all. Since precip >= 0 and has no NAs
# (checked above), log(precip + 1) is finite, so the result is integer(0).
na_index = list(which(is.na(log(bike_share$precip + 1)), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)

Transforming predictors

# Recode holiday NAs to 0 (NA here means "not a holiday"), then log-transform
# all predictors for the regression models below.
hol = bike_share$holiday
# use is.na() to identify NA values in the list
na_values <- is.na(hol)
# use ifelse() to replace NA values with 0
hol1 <- ifelse(na_values, 0, hol)  #for latter models
holiday = log(hol1 + 1)
temp_max = log(temp_max)
# Alias the already-logged series. The previous line `log(temp_max)` was
# applied a second time here, so log_temp_max held log(log(temp_max)) by
# mistake. (log_temp_max is not used downstream, so this fix is safe.)
log_temp_max = temp_max
precip = log(bike_share$precip + 1)
wind = log(bike_share$wind)
tsplot(temp_max)

tsplot(precip)

tsplot(wind)

plot(log_total_cust,temp_max)

plot(log_total_cust,precip)

plot(log_total_cust,wind)

# Final NA check on the transformed precip, then a regression with ARIMA
# errors using temp_max, precip and wind as external regressors.
na_index = list(which(is.na(precip), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)
length(log_total_cust)
## [1] 2922
#na_or_inf = is.na(xreg) | is.infinite(xreg)
# prepare vector of our independent x-variables
xreg <- cbind(temp_max, precip, wind)

fit <- auto.arima(log_total_cust, xreg = xreg)
checkresiduals(fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(3,1,2) errors
## Q* = 19.44, df = 5, p-value = 0.001591
## 
## Model df: 5.   Total lags used: 10
# Refit the xreg model with astsa::sarima to get the coefficient ttable;
# all three regressors are highly significant (p = 0 below).
sarima(log_total_cust, 3,1,2, xreg = xreg)
## initial  value -0.928119 
## iter   2 value -1.053179
## iter   3 value -1.090721
## iter   4 value -1.106148
## iter   5 value -1.117877
## iter   6 value -1.120666
## iter   7 value -1.125214
## iter   8 value -1.128461
## iter   9 value -1.130893
## iter  10 value -1.131619
## iter  11 value -1.132809
## iter  12 value -1.133803
## iter  13 value -1.136221
## iter  14 value -1.137416
## iter  15 value -1.138183
## iter  16 value -1.138604
## iter  17 value -1.138917
## iter  18 value -1.139127
## iter  19 value -1.139186
## iter  20 value -1.139265
## iter  21 value -1.139352
## iter  22 value -1.139393
## iter  23 value -1.139421
## iter  24 value -1.139459
## iter  25 value -1.139462
## iter  26 value -1.139464
## iter  27 value -1.139468
## iter  28 value -1.139475
## iter  29 value -1.139484
## iter  30 value -1.139488
## iter  31 value -1.139490
## iter  32 value -1.139491
## iter  33 value -1.139493
## iter  34 value -1.139496
## iter  35 value -1.139507
## iter  36 value -1.139508
## iter  37 value -1.139513
## iter  38 value -1.139518
## iter  39 value -1.139521
## iter  40 value -1.139522
## iter  41 value -1.139531
## iter  42 value -1.139537
## iter  43 value -1.139545
## iter  44 value -1.139553
## iter  45 value -1.139574
## iter  46 value -1.139650
## iter  47 value -1.139664
## iter  48 value -1.139695
## iter  49 value -1.139732
## iter  50 value -1.139833
## iter  51 value -1.139932
## iter  52 value -1.140018
## iter  53 value -1.140025
## iter  54 value -1.140036
## iter  55 value -1.140043
## iter  56 value -1.140059
## iter  57 value -1.140067
## iter  58 value -1.140081
## iter  59 value -1.140101
## iter  60 value -1.140121
## iter  61 value -1.140130
## iter  62 value -1.140133
## iter  63 value -1.140134
## iter  64 value -1.140134
## iter  65 value -1.140135
## iter  66 value -1.140135
## iter  67 value -1.140136
## iter  68 value -1.140137
## iter  69 value -1.140137
## iter  69 value -1.140137
## final  value -1.140137 
## converged
## initial  value -1.139132 
## iter   2 value -1.139134
## iter   3 value -1.139138
## iter   4 value -1.139138
## iter   5 value -1.139139
## iter   6 value -1.139141
## iter   7 value -1.139143
## iter   8 value -1.139144
## iter   9 value -1.139145
## iter  10 value -1.139145
## iter  11 value -1.139145
## iter  12 value -1.139146
## iter  13 value -1.139146
## iter  14 value -1.139146
## iter  14 value -1.139146
## iter  14 value -1.139146
## final  value -1.139146 
## converged

## $fit
## 
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, 
##     Q), period = S), xreg = xreg, transform.pars = trans, fixed = fixed, optim.control = list(trace = trc, 
##     REPORT = 1, reltol = tol))
## 
## Coefficients:
##           ar1     ar2     ar3      ma1      ma2  temp_max   precip     wind
##       -0.6848  0.2155  0.0266  -0.0608  -0.8048    0.8707  -0.0972  -0.1181
## s.e.   0.0677  0.0302  0.0223   0.0646   0.0610    0.0469   0.0066   0.0152
## 
## sigma^2 estimated as 0.1024:  log likelihood = -817.27,  aic = 1652.55
## 
## $degrees_of_freedom
## [1] 2913
## 
## $ttable
##          Estimate     SE  t.value p.value
## ar1       -0.6848 0.0677 -10.1183  0.0000
## ar2        0.2155 0.0302   7.1398  0.0000
## ar3        0.0266 0.0223   1.1898  0.2342
## ma1       -0.0608 0.0646  -0.9411  0.3467
## ma2       -0.8048 0.0610 -13.1964  0.0000
## temp_max   0.8707 0.0469  18.5500  0.0000
## precip    -0.0972 0.0066 -14.6585  0.0000
## wind      -0.1181 0.0152  -7.7902  0.0000
## 
## $AIC
## [1] 0.565748
## 
## $AICc
## [1] 0.5657649
## 
## $BIC
## [1] 0.5841722

Spectrum

# Raw periodogram of the log series (linear and log y-axis).
mvspec(log_total_cust, col=4, lwd=1)

mvspec(log_total_cust, col=4, lwd=1, log='y')

Nonparametric spectral estimation: smoothing the periodogram yields better confidence intervals.

# Smoothed (spans = 2) periodogram for tighter confidence intervals.
mvspec(log_total_cust, spans = 2, col=4, lwd=1)

mvspec(log_total_cust, spans = 2, col=4, lwd=1, log='y')

Fit an autoregressive spectral estimator to the sunspot data using spec.ar(). Display the parametric spectral estimate curve and the non-parametric one on the same graph, and write a sentence to compare them.

Find the biggest frequency: (7, 8, 9)

# Identify dominant low frequencies in the smoothed spectrum by thresholding
# the spectral density, then recover the corresponding frequencies and periods.
cust_spec = mvspec(log_total_cust, spans = 2, col=4, lwd=1)

which(cust_spec$spec>50)
## [1] 8
which(cust_spec$spec>40)
## [1] 8 9
which(cust_spec$spec>30)
## [1] 7 8 9
# The important frequency is spot 8 and 9
cust_spec$spec[8]
## [1] 73.00253
cust_spec$spec[9]
## [1] 42.83057
# Frequencies
cust_spec$freq[8] 
## [1] 0.002666667
cust_spec$freq[9] 
## [1] 0.003
# Periods (in days: roughly annual cycles at 375 and 333 days)
1/cust_spec$freq[8]
## [1] 375
1/cust_spec$freq[9]
## [1] 333.3333
# Harmonic regression: log-time trend plus one sinusoid at the dominant
# frequency omega1.
t = time(bike_share$date)

omega1 = cust_spec$freq[8]


# Let's see how much omega1 explains
Xcos = cos(2*pi*t*omega1)
Xsin = sin(2*pi*t*omega1)


modC = lm(log_total_cust ~ log(t) + Xcos+Xsin)
summary(modC) # adjusted R^2 of 0.578 (see printed output below)
## 
## Call:
## lm(formula = log_total_cust ~ log(t) + Xcos + Xsin)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2992 -0.1246  0.0559  0.2385  1.0542 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.205848   0.054571  113.72   <2e-16 ***
## log(t)       0.368908   0.007739   47.67   <2e-16 ***
## Xcos        -0.397353   0.010843  -36.65   <2e-16 ***
## Xsin         0.187456   0.010819   17.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4121 on 2918 degrees of freedom
## Multiple R-squared:  0.5788, Adjusted R-squared:  0.5784 
## F-statistic:  1337 on 3 and 2918 DF,  p-value: < 2.2e-16
plot(modC,which=1) # looks good

tsplot(modC$residuals) # looks good

# Overlay fitted values (red) on the observed series (black), back-transformed
# to the customer-count scale.
histfit=ts(predict(modC),start=1)
preddf = cbind(log_total_cust, histfit)
plot(exp(preddf), plot.type="single", col = c("black","red"))

# Add a second harmonic at omega2; only its cosine term is significant
# (see the summary below).
t = time(bike_share$date)

omega1 = cust_spec$freq[8]
omega2 = cust_spec$freq[9]

# Let's see how much omega1 and omega2 explains
Xcos = cos(2*pi*t*omega1)
Xsin = sin(2*pi*t*omega1)
Xcos2 = cos(2*pi*t*omega2)
Xsin2 = sin(2*pi*t*omega2)

modC = lm(log_total_cust ~  log(t) + Xcos+Xsin +Xcos2+Xsin2)
summary(modC) 
## 
## Call:
## lm(formula = log_total_cust ~ log(t) + Xcos + Xsin + Xcos2 + 
##     Xsin2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4430 -0.1350  0.0609  0.2312  0.9451 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.168969   0.053330 115.675   <2e-16 ***
## log(t)       0.374606   0.007564  49.528   <2e-16 ***
## Xcos        -0.392065   0.010533 -37.222   <2e-16 ***
## Xsin         0.185559   0.010508  17.658   <2e-16 ***
## Xcos2        0.141027   0.010509  13.419   <2e-16 ***
## Xsin2       -0.012495   0.010513  -1.189    0.235    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4 on 2916 degrees of freedom
## Multiple R-squared:  0.6034, Adjusted R-squared:  0.6027 
## F-statistic: 887.4 on 5 and 2916 DF,  p-value: < 2.2e-16
plot(modC,which=1) 

tsplot(modC$residuals) 

# Fitted vs observed on the original count scale.
histfit=ts(predict(modC),start=1)
preddf = cbind(log_total_cust, histfit)
plot(exp(preddf), plot.type="single", col = c("black","red"))

Mixed Model:

# Mixed model: harmonics plus weather and holiday regressors in one lm.
t = time(bike_share$date)

omega1 = cust_spec$freq[8]
omega2 = cust_spec$freq[9]
omega3 = cust_spec$freq[7]


Xcos = cos(2*pi*t*omega1)
Xsin = sin(2*pi*t*omega1)
Xcos2 = cos(2*pi*t*omega2)
Xsin2 = sin(2*pi*t*omega2)
# NOTE(review): Xcos3/Xsin3 (omega3) are computed but never used in modC below.
Xcos3 = cos(2*pi*t*omega3)
Xsin3 = sin(2*pi*t*omega3)

modC = lm(log_total_cust ~  log(t) + Xcos+Xsin +Xcos2+temp_max+ precip + wind + holiday)
summary(modC) 
## 
## Call:
## lm(formula = log_total_cust ~ log(t) + Xcos + Xsin + Xcos2 + 
##     temp_max + precip + wind + holiday)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9342 -0.1298  0.0335  0.1934  0.9284 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.608141   0.152241  17.132  < 2e-16 ***
## log(t)       0.366204   0.006341  57.748  < 2e-16 ***
## Xcos        -0.100327   0.014559  -6.891 6.77e-12 ***
## Xsin         0.105084   0.009757  10.771  < 2e-16 ***
## Xcos2        0.057372   0.009357   6.132 9.87e-10 ***
## temp_max     0.921385   0.035836  25.711  < 2e-16 ***
## precip      -0.127907   0.006309 -20.275  < 2e-16 ***
## wind        -0.086461   0.015245  -5.671 1.56e-08 ***
## holiday     -0.391616   0.052179  -7.505 8.10e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3352 on 2913 degrees of freedom
## Multiple R-squared:  0.7219, Adjusted R-squared:  0.7211 
## F-statistic:   945 on 8 and 2913 DF,  p-value: < 2.2e-16
plot(modC,which=1) 

tsplot(modC$residuals) 

# Final fitted-vs-observed overlay on the customer-count scale.
histfit=ts(predict(modC),start=1)
preddf = cbind(log_total_cust, histfit)
plot(exp(preddf), plot.type="single", col = c("black","red"))